In [ ]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings

import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from matplotlib.cm import ScalarMappable
from umap import UMAP

sys.path.append("../")

data_path = "../data"

from models.cx_groups import Cx_groups
from models.pca_clients import Cx_groups_post_pca 
from models.easy_pca import Easy_pca
from scripts.optimizers_mp import k_means_optimizer
from scripts.df_actions import remove_outliers

pio.renderers.default = "notebook_connected"

load_dotenv()
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

try:
    pc_dpi = int(os.getenv('DPI'))
except TypeError:
    pc_dpi = 100
if pc_dpi is None:
    pc_dpi = 100

Taking into account what RFM approach taught us and the clusters we extracted from the data :

1 : Statistics considering the clusters obtained via RFM¶

2 : Delta Order / Delivery and its effects on customers' satisfaction.¶

3 : Delta dist, is it a determining factor in the choice made by clients ?¶

4 : PCA on selected variables for classification¶

5 : Modelisations and early conclusions¶


In [ ]:
olist_clients_dataset = "../final_datasets/olist_customers.pkl"
In [ ]:
df_cx = pd.read_pickle(olist_clients_dataset)

df_cx.describe()
Out[ ]:
customer_uid recency frequency monetary num_orders cluster_kmeans_4 cluster_DBSCAN lat lon rating_avg rating_ratio comment_ratio delta_delivery expected_reality distance_cx_seller
count 95922.000000 9.592200e+04 95922.000000 95922.000000 95922.000000 95922.000000 95922.000000 95653.000000 95653.000000 95211.000000 95922.000000 95922.000000 90212 90212 94683.000000
mean 48050.027408 2.488008e+07 0.144087 164.790904 1.034872 1.581326 0.040502 -21.186477 -46.175050 4.085653 0.992588 0.410418 12 days 10:59:52.732275085 -12 days +02:32:54.229592516 602.671260
std 27740.300523 1.323897e+07 0.106734 227.880166 0.214573 1.210225 3.080952 5.628223 4.066423 1.341295 0.085775 0.490193 8 days 19:34:30.557032786 9 days 08:37:12.484771075 596.799614
min 1.000000 0.000000e+00 0.000000 0.000000 1.000000 0.000000 0.000000 -36.605374 -72.666706 1.000000 0.000000 0.000000 0 days 12:48:07 -51 days +00:00:00 0.000000
25% 24028.250000 1.415290e+07 0.071299 62.370000 1.000000 0.000000 0.000000 -23.588447 -48.110368 4.000000 1.000000 0.000000 6 days 18:15:44.750000 -17 days +00:00:00 189.232300
50% 48048.500000 2.322363e+07 0.105751 107.220000 1.000000 2.000000 0.000000 -22.926003 -46.631276 5.000000 1.000000 0.000000 10 days 04:58:03 -12 days +00:00:00 434.458400
75% 72077.750000 3.431603e+07 0.172824 182.100000 1.000000 3.000000 0.000000 -20.134916 -43.605058 5.000000 1.000000 1.000000 15 days 17:00:39.250000 -7 days +00:00:00 799.043250
max 96095.000000 6.677370e+07 1.701609 13664.080000 17.000000 3.000000 255.000000 42.184003 -8.577855 5.000000 1.000000 1.000000 103 days 21:34:37 55 days 00:00:00 8736.947600
In [ ]:
df_cx.head()
Out[ ]:
customer_uid most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders cluster_kmeans_4 cluster_DBSCAN k_cluster_name ... order_id_list rating_avg rating_ratio comment_ratio has_rated has_commented delta_delivery expected_reality early_on_time distance_cx_seller
0 1 2017-05-16 15:05:35 2017-05-16 15:05:35 44850283.0 0.053939 146.87 1 3 0 early_majority ... [88493] 4.0 1.0 0.0 True False 8 days 19:30:00 -11 days True 346.9776
1 2 2018-01-12 20:48:24 2018-01-12 20:48:24 24007314.0 0.100769 335.48 1 3 0 early_majority ... [90419] 5.0 1.0 0.0 True False 16 days 15:52:55 -8 days True 413.9531
2 3 2018-05-19 16:07:45 2018-05-19 16:07:45 13051353.0 0.185360 157.73 1 3 0 early_majority ... [22558] 5.0 1.0 0.0 True False 26 days 01:51:06 1 days False 29.5741
3 4 2018-03-13 16:06:38 2018-03-13 16:06:38 18840220.0 0.128406 173.30 1 1 0 investors ... [32181] 5.0 1.0 0.0 True False 14 days 23:57:47 -13 days True 19.3538
4 5 2018-07-29 09:51:30 2018-07-29 09:51:30 6939528.0 0.348612 252.25 1 0 0 late_majority ... [69903] 5.0 1.0 1.0 True True 11 days 11:04:18 -6 days True 219.7266

5 rows × 22 columns

In [ ]:
df_cx.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95922 entries, 0 to 95921
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype          
---  ------                 --------------  -----          
 0   customer_uid           95922 non-null  uint32         
 1   most_ancient_order_dt  95922 non-null  datetime64[ns] 
 2   most_recent_order_dt   95922 non-null  datetime64[ns] 
 3   recency                95922 non-null  float64        
 4   frequency              95922 non-null  float64        
 5   monetary               95922 non-null  float64        
 6   num_orders             95922 non-null  uint8          
 7   cluster_kmeans_4       95922 non-null  uint8          
 8   cluster_DBSCAN         95922 non-null  uint8          
 9   k_cluster_name         95922 non-null  object         
 10  lat                    95653 non-null  float64        
 11  lon                    95653 non-null  float64        
 12  order_id_list          95922 non-null  object         
 13  rating_avg             95211 non-null  float64        
 14  rating_ratio           95922 non-null  float64        
 15  comment_ratio          95922 non-null  float64        
 16  has_rated              95922 non-null  bool           
 17  has_commented          95922 non-null  bool           
 18  delta_delivery         90212 non-null  timedelta64[ns]
 19  expected_reality       90212 non-null  timedelta64[ns]
 20  early_on_time          95922 non-null  bool           
 21  distance_cx_seller     94683 non-null  float64        
dtypes: bool(3), datetime64[ns](2), float64(9), object(2), timedelta64[ns](2), uint32(1), uint8(3)
memory usage: 11.9+ MB

1 : Statistics considering the clusters obtained via RFM.¶

1.1 : Ratings across all the dataset:¶

Goals :

 This will help to visualize how many people have left at least one Review, and if they commented it. We will see the involvement of customers, first across all the dataset. We call calculate the avg of all ratings to provide a reference/comparison point for ratings per cluster

Method :

 We will first assess the amount of customers that have rated and/or commented. We will calculate the average ratio for ratings and comments. It will be interesting to see, first, those metrics on the dataset, then applied to our first successful segmentation (k-means w/ k=4)

In [ ]:
# Checking if people can comment without rating : 

commenters = df_cx[df_cx["has_commented"] == True]
print(f"{len(commenters[commenters['rating_avg'] == np.nan])}")

del commenters  # Flush
0
In [ ]:
def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], f"{y[i]}", ha="center", bbox=dict(facecolor="white", alpha=1))
In [ ]:
# No they cant, okay so cx who have posted a comment have to rate the order.

ratings_avg_dswide = np.average(df_cx[df_cx["rating_avg"].notna()]["rating_avg"].values.tolist())
ratings_ratio_dswide = np.average(df_cx["rating_ratio"])
comment_ratio_dswide = np.average(df_cx["comment_ratio"])

rating_poster_dswide = len(df_cx[df_cx["has_rated"] == True])
comment_poster_dswide = len(df_cx[df_cx["has_commented"] == True])
no_rating = len(df_cx[df_cx["has_rated"] == False])
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(10, 6),
    dpi=pc_dpi,
)

bars = {"rating_poster": rating_poster_dswide, "comment_and_rate": comment_poster_dswide, "no_review": no_rating}

cmap_one = ["#000331", "navy", "red"]

ax1.bar(x=list(bars.keys()), height=list(bars.values()), color=cmap_one)

###
# Titles/Lables
addlabels(x=[cat for cat in bars.keys()], y=[pop for pop in bars.values()])
ax1.set_ylabel("Population of group")
ax1.set_xlabel("Group")
ax1.tick_params(left=False)
fig.suptitle("Amount of users by their method of rating")
#
###

fig.tight_layout()
plt.show()

 By design (Olist sharing mostly orders which have been subject to customer review) or by accident (very small non zero chance), the crushing majority of customers have rated their orders at least once (95211 out of 95922, or about 99.26% of customers).

 There is also a considerable amount of customers who have assorted their reviews with a comment (You need to rate the order to comment it, at least, in this dataset, no commented order was unrated) : 39700 out of 95922 clients, which is about 41.39% of the total population.

 Finally, a crushing minority of clients (711, which represents 0.74% of the dataset) have neither rated nor commented any of their orders. Due to the nature of the data, we can theorize that this concerns orders that have not been yet completed (delivered ?) at the time of SQL dump. We will include this in our report and inquire about those 711 customers.


 We need to visualize at which ratio customers have reviewed and commented in average, but since most (but not all) customers have only placed one order, we can expect this number to quite close to 1, for ratings at least.

In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(10, 6),
    dpi=pc_dpi,
)

bars_ratios_dswide = {"review_ratio": ratings_ratio_dswide, "comment_ratio": comment_ratio_dswide}

cmap_one = ["#000331", "navy"]

ax1.bar(x=list(bars_ratios_dswide.keys()), height=list(bars_ratios_dswide.values()), color=cmap_one)

###
# Titles/Lables
addlabels(x=[cat for cat in bars_ratios_dswide.keys()], y=[pop for pop in bars_ratios_dswide.values()])
ax1.set_ylabel("Ratio (0-1)")
ax1.tick_params(left=False)
fig.suptitle("Average rating and comment ratio, datasetwide")
#
###

fig.tight_layout()
plt.show()

 That was expected, this is in line with what we saw on the previous chart. Average ratio of ratings is close to 1 and comment ratio close to 0.4139 (our 41.39% from the comment group earlier).

 We can conclude this analysis of the review ratio by stating that, being that close to 1 and seemingly almost mandatory for the order to be in the dataset, is of no use since it cannot efficiently help our clustering. We later drop this variable (keeping the boolean for now), but we suggest its use in the model chosen at the end of this analysis. According to this article : this article, in 2019, 47% of e-commerce customers leave a review each month. It would be necessary to see if that proportion is similar in the whole Olist database or it if keeps being that high.



 Let's take a look at the rating distribution in the dataset and the average rating left by customers.

In [ ]:
## Rounding up to 1/1.5/2/2.5/3/3.5/4/4.5/5
rounded_list = []

og_list = df_cx[df_cx["rating_avg"].notna()]["rating_avg"].values.tolist()

for rating in og_list:
    rounded_list.append(round(rating * 2) / 2)

uniques = []
for rating in rounded_list:
    if rating not in uniques:
        uniques.append(rating)

uniques.sort()

print(uniques)

# Nice, feels like im reinventing the wheels there but that works
[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
In [ ]:
ratings_dswide = dict.fromkeys(uniques)

for rating in ratings_dswide:
    ratings_dswide[rating] = rounded_list.count(rating)
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(10, 6),
    dpi=pc_dpi,
)

colors = ["black", "dimgray", "darkred", "red", "yellow", "steelblue", "royalblue", "navy", "#000331"]

ratings = [str(number) for number in np.arange(1, 5.5, 0.5)]
amount = rating
ax1.bar(x=ratings, height=list(ratings_dswide.values()), color=colors)

###
# Titles/Lables
addlabels(x=[rating for rating in ratings_dswide.keys()], y=[pop for pop in ratings_dswide.values()])
#
###

fig.tight_layout()
plt.show()

 Looks like there is an overwhelming number of very high ratings (5 = half of the customers). This looks overly optimistic but it is not unlikely as the study quoted earlier states that satisfied customers are more likely to give a review. Negative ones too but less so, according to this study (point 25).

 So we can expect the average overall rating to be quite high. Let's check.

In [ ]:
print(f"Average for the whole dataset = {ratings_avg_dswide}")
Average for the whole dataset = 4.085652664261414

Ok so we have our reference value, now we can use it as a baseline to determine clusters characteristics

1.2 : Statistics using the clusters attributed by k-means with k=4:¶

Goals :

 We might be able to distinguish different characteristics from cluster to cluster, hence giving us a way to compare and evaluate the relevance of a given variable. (Which could otherwise be done using SHAP)

Method :

 For starters, Let's use names of clusters and not their numbers. It will make more sense to talk about people and not numbers.


We will use a radar plot to represent the most important statistics and their average per group :

  • Delta Days
  • Average Rating
  • Delta Distance
In [ ]:
## Min Max Scaler doesnt support timedeltas, so we need to convert timedeltas to ints (days)

def get_delta_days(row):
    try:
        return int(row["expected_reality"].days)
    except ValueError:
        return np.nan
    
In [ ]:
df_cx["delta_days"] = df_cx.apply(get_delta_days, axis=1)
In [ ]:
mms = MinMaxScaler(feature_range=(0, 5))

subset = ["rating_avg", "delta_days", "distance_cx_seller", "recency", "frequency"]
scaled_subset = [
    "scaled_rating_avg", "scaled_delta_days",
    "scaled_distance_cx_seller", "scaled_recency",
    "scaled_frequency"
    ]

keepcols = ["rating_avg", "delta_days", "distance_cx_seller", "recency", "frequency", "k_cluster_name"]

df_cx_mms = df_cx[keepcols].copy()

df_cx_mms[scaled_subset] = mms.fit_transform(df_cx_mms[subset].to_numpy())

df_cx_mms.head()
Out[ ]:
rating_avg delta_days distance_cx_seller recency frequency k_cluster_name scaled_rating_avg scaled_delta_days scaled_distance_cx_seller scaled_recency scaled_frequency
0 4.0 -11.0 346.9776 44850283.0 0.053939 early_majority 3.75 1.886792 0.198569 3.358379 0.158495
1 5.0 -8.0 413.9531 24007314.0 0.100769 early_majority 5.00 2.028302 0.236898 1.797662 0.296100
2 5.0 1.0 29.5741 13051353.0 0.185360 early_majority 5.00 2.452830 0.016925 0.977282 0.544661
3 5.0 -13.0 19.3538 18840220.0 0.128406 investors 5.00 1.792453 0.011076 1.410752 0.377308
4 5.0 -6.0 219.7266 6939528.0 0.348612 late_majority 5.00 2.122642 0.125746 0.519630 1.024359
In [ ]:
investor_group = Cx_groups(cluster_name="investors", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "investors"])
early_m_group = Cx_groups(cluster_name="early_majority", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "early_majority"])
late_m_group = Cx_groups(cluster_name="late_majority", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "late_majority"])
lagg_group = Cx_groups(cluster_name="laggards", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "laggards"])
In [ ]:
radar_inv = investor_group.store_radar()
em_radar = early_m_group.store_radar()
lm_radar = late_m_group.store_radar()
lagg_radar = lagg_group.store_radar()

# Let's superpose the radars : (Class Cx Group saves the trace)
In [ ]:
fig = go.Figure()

fig.add_trace(go.Scatterpolar(investor_group.trace))
fig.add_trace(go.Scatterpolar(early_m_group.trace))
fig.add_trace(go.Scatterpolar(late_m_group.trace))
fig.add_trace(go.Scatterpolar(lagg_group.trace))

colors = ["#000331", "navy", "royalblue", "red"]

title = "Newly created stats, MinMaxed for each cluster determined during RFM segmentation Each cluster"

fig.update_layout(
  title=title,
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 5]
    )),
  showlegend=True
)

fig.show()
In [ ]:
investor_group.get_standard_stats()
investor_group.standard_stats
Out[ ]:
Group_stats(rating_avg=4.0801387953803925, delta_days=-11.95422070784788, distance_cx_seller=603.6218243071827, recency=24705325.87966058, frequency=0.14536305281344908)
In [ ]:
lagg_group.get_standard_stats()
lagg_group.standard_stats
Out[ ]:
Group_stats(rating_avg=4.079201838304257, delta_days=-11.880257357321577, distance_cx_seller=599.0734830752654, recency=24707661.49371684, frequency=0.14487645549994077)
In [ ]:
df_cx.describe()
Out[ ]:
customer_uid recency frequency monetary num_orders cluster_kmeans_4 cluster_DBSCAN lat lon rating_avg rating_ratio comment_ratio delta_delivery expected_reality distance_cx_seller delta_days
count 95922.000000 9.592200e+04 95922.000000 95922.000000 95922.000000 95922.000000 95922.000000 95653.000000 95653.000000 95211.000000 95922.000000 95922.000000 90212 90212 94683.000000 90212.000000
mean 48050.027408 2.488008e+07 0.144087 164.790904 1.034872 1.581326 0.040502 -21.186477 -46.175050 4.085653 0.992588 0.410418 12 days 10:59:52.732275085 -12 days +02:32:54.229592516 602.671260 -11.893817
std 27740.300523 1.323897e+07 0.106734 227.880166 0.214573 1.210225 3.080952 5.628223 4.066423 1.341295 0.085775 0.490193 8 days 19:34:30.557032786 9 days 08:37:12.484771075 596.799614 9.359172
min 1.000000 0.000000e+00 0.000000 0.000000 1.000000 0.000000 0.000000 -36.605374 -72.666706 1.000000 0.000000 0.000000 0 days 12:48:07 -51 days +00:00:00 0.000000 -51.000000
25% 24028.250000 1.415290e+07 0.071299 62.370000 1.000000 0.000000 0.000000 -23.588447 -48.110368 4.000000 1.000000 0.000000 6 days 18:15:44.750000 -17 days +00:00:00 189.232300 -17.000000
50% 48048.500000 2.322363e+07 0.105751 107.220000 1.000000 2.000000 0.000000 -22.926003 -46.631276 5.000000 1.000000 0.000000 10 days 04:58:03 -12 days +00:00:00 434.458400 -12.000000
75% 72077.750000 3.431603e+07 0.172824 182.100000 1.000000 3.000000 0.000000 -20.134916 -43.605058 5.000000 1.000000 1.000000 15 days 17:00:39.250000 -7 days +00:00:00 799.043250 -7.000000
max 96095.000000 6.677370e+07 1.701609 13664.080000 17.000000 3.000000 255.000000 42.184003 -8.577855 5.000000 1.000000 1.000000 103 days 21:34:37 55 days 00:00:00 8736.947600 55.000000

 As we can see here and in the plots above, the ratings are unexpectedly high and the standard deviation very low. On our sample, customers are overall satisfied/very satisfied. This behaviors with a very high mean and average, and very low std, even between the groups, is not beneficial pour any clustering model : if there are no noticeable differences between the clients, it will prove difficult to propose a comprehensive clustering mean to help Olist with customer segmentation. If the gain extracted from this new variable does not improve the model but on the contrary, flaws it, we will recommend the simple RFM approach.

In [ ]:
df_cx["rating_avg"].describe()
Out[ ]:
count    95211.000000
mean         4.085653
std          1.341295
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: rating_avg, dtype: float64
In [ ]:
df_cx["rating_avg"].quantile([0.1, 0.25, 0.50, 0.75, 0.90])
Out[ ]:
0.10    1.0
0.25    4.0
0.50    5.0
0.75    5.0
0.90    5.0
Name: rating_avg, dtype: float64
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 2),
    dpi=pc_dpi,
)

g = sns.boxplot(x="rating_avg", data=df_cx, ax=ax1)

###
# Titles/Lables
fig.suptitle("Rating repartition across the dataset")
#
###
fig.tight_layout()
plt.show()

Ok so more than 75% of the customer sample did not rate, in average, under 4. If there is something better to use as a satisfaction indicator, we'll take it. Meanwhile, let's investigate on this sub-sample of customers : the ones that have given a 2.5 rating or worse in average.

In [ ]:
df_cx_sample = df_cx[df_cx["rating_avg"] <= 2.5]

print(f"{len(df_cx_sample)} have rated under 2.5")
13909 have rated under 2.5
In [ ]:
df_cx_sample["k_cluster_name"].value_counts()
Out[ ]:
early_majority    4624
late_majority     3825
investors         2832
laggards          2628
Name: k_cluster_name, dtype: int64

 Ok so there's a lot of every one based from our previous clustering. Most important groups are the two majorities (which represent both around 32/35%), which was to be expected. What is interesting is to see that our "laggards" from the previous segmentation (which represented 18% of the dataset) are more or less as important here as the group we called "investors" in the first clustering. While they both represent around 20%, we have almost an equal amount of both. Which is interesting as we did not expect the "investor" group to give low ratings, on the contrary. This is a good indication that we lacked information in our RFM clustering and what we obtained during the feature engineering will likely improve the initial segmentation. In the rest of the analysis, we will also compare this group specifically and maybe diagnose the nature of this "low" rating.


2 : Delta Order / Delivery and its effects on customers' satisfaction.¶

2.1 : Delta Order / Delivery : choice of variable.¶

 While we have two timedelta variables for this indicator, there is one which will likely not result in an improvement for the clustering. Indeed, the raw value of ∆Order/Delivery might not be as useful as expected, and ∆expected/delivery would be much more precise.

 Let's say I pre-order something (P1) on the internet that's supposed to come out in 1 month, and I order another thing (P2) that I expect in a week. If every thing goes according to what has been indicated, I will receive P1 in a month and P2 in a week, those two orders won't have the same ∆order/delivery and that will impact negatively P1 (∆ = 1 month) over P2 (∆ = 1 week).
 Now the ∆Expected/Delivered takes into account the initial expectation. So in the case all is fine and on time, ∆ will be 0 for both P1 and P2, a negative ∆ will indicate that it was delivered early and a positive one, late. This is likely way more precise.

 We will use ∆expected/delivery from now on. Let's see it this has an impact on Cx satisfaction.

In [ ]:
x_axis = df_cx[df_cx["delta_days"].notna()]["delta_days"].unique()

x_height = dict.fromkeys(x_axis)

for x in x_height:
    x_height[x] = np.average(df_cx[(df_cx["delta_days"] == x) & (df_cx["rating_avg"].notna())]["rating_avg"].values.tolist())
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 4),
    dpi=pc_dpi,
)

ax1.scatter(x=list(x_height.keys()), y=list(x_height.values()))

###
# Titles/Lables
ax1.set_xlabel("Delta Days between expected and actual delivery")
ax1.set_ylabel("Average Rating for a given Delta Day")
fig.suptitle("Average rating by Days elapsed between expected and actual delivery")
#
###

fig.tight_layout()
plt.show()

Observation :¶

 As expected, clients' ratings are related to the punctuality of Olist. If the order arrives early or on time, clients' ratings are high, and this trend inverses as delta=0 is reached. People are more dissatisfied for every day Olist is late. The +30 days relatively high average bump in rating can probably be attributed to the lack of orders delivered beyond 30 days. Let's check if our dataset containing our relatively dissatisfied customers contains more clients that have been delivered late.

In [ ]:
amount_eot_all = len(df_cx[df_cx["early_on_time"] == True])
amount_eot_sample = len(df_cx_sample[df_cx_sample["early_on_time"] == True])

percentage_all = (amount_eot_all / len(df_cx)) * 100
percentage_sample = (amount_eot_sample / len(df_cx_sample)) * 100

print(f"There is {percentage_all}% of who have been delivered on time on the whole dataset")
print(f"There is {percentage_sample}% of who have been delivered\
 on time amongst the clients who have rated 2.5 or less (avg)")
There is 90.80607159984154% of who have been delivered on time on the whole dataset
There is 58.15658925875332% of who have been delivered on time amongst the clients who have rated 2.5 or less (avg)

Observation :¶

 There is indeed a great difference between the sample of dissatisfied customers and the whole dataset. 90.8% of the clients have received their orders early or on time on the whole dataset and just 58.2% of the clients for the sample containing people who have left a rating of 2.5 or less (avg.). We can safely say that a big part of the dissatisfied customers were disappointed because they were not delivered on time.


3 : Delta dist, is it a determining factor in the choice made by clients ?¶

3.1 : Number of orders in function of distance cx/seller¶

How ?¶

 We will take the min/max value of distance_cx_seller and create "distance blocks" (array (min, max, step)) where we will count the number of orders made by clients inside the current range (num_orders)

In [ ]:
print(f"There is {df_cx['distance_cx_seller'].isna().sum()} NA values in the dataset")
print(f"The Maximum distance is {df_cx['distance_cx_seller'].max()} Kms between seller and cx")

df_cx["distance_cx_seller"].describe()
There is 1239 NA values in the dataset
The Maximum distance is 8736.9476 Kms between seller and cx
Out[ ]:
count    94683.000000
mean       602.671260
std        596.799614
min          0.000000
25%        189.232300
50%        434.458400
75%        799.043250
max       8736.947600
Name: distance_cx_seller, dtype: float64

 It seems we have a lot of NA (which we will fill with the median, here 434.45 and change). Let's also plot the distribution to get a better sense of the extremes, which we will include as "more/less than whisker" to avoid over representation of the extremes

In [ ]:
distances = df_cx[df_cx["distance_cx_seller"].notna()]["distance_cx_seller"].values.tolist()
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 4),
    dpi=pc_dpi,
)

ax1.boxplot(x=distances, vert=False, showbox=True, showmeans=True, widths=(0.4))

###
# Titles/Lables
ax1.set_xlabel("Number of average kilometers between customer and seller")
fig.suptitle("Representation of clients based on their average distance to seller when order was placed")
ax1.set_yticks([])
#
###

fig.tight_layout()
plt.show()

Observation :¶

 Looks like most people order between 0 and 2k Kms, lets zoom on this area

In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 4),
    dpi=pc_dpi,
)

ax1.boxplot(x=distances, vert=False, showbox=True, showmeans=True, widths=(0.4))

###
# Titles/Lables
ax1.set_xlabel("Number of average kilometers between customer and seller")
fig.suptitle("Representation of clients based on their average distance to seller when order was placed\
\nzoom on less than 2000")
ax1.set_xlim(left=(-100), right=(2000))
ax1.set_xticks(range(0, 2000, 200))
ax1.set_yticks([])
#
###

fig.tight_layout()
plt.show()

 Let's fillna with the median, get 10/90% values and take a peak

In [ ]:
df_cx["distance_cx_seller"] = df_cx["distance_cx_seller"].fillna(df_cx["distance_cx_seller"].median())

distances_post_fill = df_cx[df_cx["distance_cx_seller"].notna()]["distance_cx_seller"].values.tolist()
In [ ]:
quantiles_distance = df_cx["distance_cx_seller"].quantile([0.10, 0.90])

llim = round(quantiles_distance[0.10], ndigits=4)  # Rounded to meter
rlim = round(quantiles_distance[0.90], ndigits=4)  # Rounded to meter

print(f"10% under {llim} Kms, 90% under {rlim} Kms")
10% under 36.4582 Kms, 90% under 1455.8878 Kms
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 4),
    dpi=pc_dpi,
)

ax1.boxplot(x=distances_post_fill, vert=False, showbox=True, showmeans=True, widths=(0.4), sym="")

###
# Titles/Lables
ax1.set_xlabel("Number of average kilometers between customer and seller")
fig.suptitle("Representation of clients based on their average distance to seller when order was placed\
\nzoom on less than 2000, fliers removed")
ax1.set_xlim(left=(-100), right=(2000))
ax1.set_xticks(range(0, 2000, 200))
ax1.set_yticks([])
#
###

fig.tight_layout()
plt.show()

 Filling NA by the median didn't impact negatively our data. Let's set our range to : under 50 kms, above 1500 kms with step 100kms.

In [ ]:
distance_range = np.arange(50, 1551, 100)

distance_dict = dict.fromkeys(distance_range)
In [ ]:
previous = 0
for distance in distance_range:
    if distance != distance_range[-1]:
        amnt_orders = df_cx[(df_cx["distance_cx_seller"] > previous)
                        & (df_cx["distance_cx_seller"] < distance)]["num_orders"].sum()

    else:  # Last distance of distance range, just need "more than distance"
        amnt_orders = df_cx[(df_cx["distance_cx_seller"] > 1550)]["num_orders"].sum()

    distance_dict[distance] = amnt_orders        
    previous = distance
In [ ]:
def addlabels_ranged100(x, y):
    for i in range(len(x)):
        plt.text(i * 100 + 50, y[i], f"{y[i]}", ha="center", bbox=dict(facecolor="white", alpha=1))
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 6),
    dpi=pc_dpi,
)

magma = plt.get_cmap("magma").reversed()

height_normalized = [value / max(list(distance_dict.values())) for value in list(distance_dict.values())]

colors = magma(height_normalized)

ax1.bar(x=list(distance_dict.keys()), height=list(distance_dict.values()), width=90, color=colors)

###
# Titles/Lables
ax1.set_xticks(distance_range)
ax1.set_xlabel("Average distance from seller")
addlabels_ranged100(x=[distance for distance in distance_dict.keys()], y=[amnt for amnt in distance_dict.values()])
#
###

fig.tight_layout()
plt.show()

Note : As we drew our limits between 10% and 90% of the dataset, the high amount of order above 1550 kms is to be expected as it represents 10% of the dataset

Observations :¶

 This graph shows that, indeed, people orders are more often than not to seller that are relatively close, orders from seller above 550 kms away begin to drop quite quickly. We can hypothesize that expected delivery time is shorter for shorter distances and clients prefer to get an item quickly. Although there is a non negligible part of orders ranging from 650km and above, it is possible that it is for specific products that have no alternatives locally or from customers who live in more remotes part of Brazil.

 This is a more interesting variable than expected. As we know, alternatives like Amazon for example use a subscription method like Prime to get rid of delivery fees altogether. It hints that it is not the case here or there is an incentive (fee, time etc.)

4 : PCA on selected variables for classification¶

4.1 : Variable pre-selection and conversions¶

Why ?¶

 PCA will work better on certain data, while I have coded a layer of abstraction above it to make it easier to use and streamline the process, we still need to select our variables accordingly.

  • PCA works on numerical values, so we need a few conversions :
    • Boolean values must go from True/False to 1/0
    • Datetimes and timedeltas must be converted in 1. an understandable way for the humans 2. an understandable way for the machine, so depending on the case we will use days as the main metric (most_recent_order is a repetition of recency, so we will just drop it)
    • Cluster indicators from previous clustering will also be dropped as well as any information regarding this (cluster name etc.)
    • As stated, delta_delivery is lacking compared to expected/reality, which we have already converted to delta_days, so we'll drop both and keep "delta_days"
    • Distance cx/seller proved to be an interesting variable so we'll keep it.
    • For now we will also remove the lat/lon of all clients.
    • customer_uid is purely informative and will be dropped to be re added after (to identify who's who)
    • order_id_list is also purely informative, it will not help the model in any way

By default, converting a boolean column into a int column will do the 1/0 conversion for us, not necessary to reinvent the wheel there

 We will begin by using IQR method to remove outliers from the dataset.

In [ ]:
def delta_creation(row):
    """
    Returns the difference between now and the most ancient order
    Most ancient order is understood as the account creation date for lack
    of more specific info
    """

    now = np.datetime64("now")
    then = row["most_ancient_order_dt"]
    delta = now - then
    delta = np.timedelta64(delta, "D")
    return delta.astype(int)

Let's first convert "most_ancient_order_dt", which we assimilate to the account creation. We want the number of days between then and now\ We'll keep the col for the stability analysis.

In [ ]:
df_cx["delta_creation"] = df_cx.apply(delta_creation, axis=1)
In [ ]:
df_cx["has_rated"] = df_cx["has_rated"].astype(int)
df_cx["has_commented"] = df_cx["has_commented"].astype(int)
df_cx["early_on_time"] = df_cx["early_on_time"].astype(int)
In [ ]:
df_cx.head()
Out[ ]:
customer_uid most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders cluster_kmeans_4 cluster_DBSCAN k_cluster_name ... rating_ratio comment_ratio has_rated has_commented delta_delivery expected_reality early_on_time distance_cx_seller delta_days delta_creation
0 1 2017-05-16 15:05:35 2017-05-16 15:05:35 44850283.0 0.053939 146.87 1 3 0 early_majority ... 1.0 0.0 1 0 8 days 19:30:00 -11 days 1 346.9776 -11.0 1969
1 2 2018-01-12 20:48:24 2018-01-12 20:48:24 24007314.0 0.100769 335.48 1 3 0 early_majority ... 1.0 0.0 1 0 16 days 15:52:55 -8 days 1 413.9531 -8.0 1728
2 3 2018-05-19 16:07:45 2018-05-19 16:07:45 13051353.0 0.185360 157.73 1 3 0 early_majority ... 1.0 0.0 1 0 26 days 01:51:06 1 days 0 29.5741 1.0 1601
3 4 2018-03-13 16:06:38 2018-03-13 16:06:38 18840220.0 0.128406 173.30 1 1 0 investors ... 1.0 0.0 1 0 14 days 23:57:47 -13 days 1 19.3538 -13.0 1668
4 5 2018-07-29 09:51:30 2018-07-29 09:51:30 6939528.0 0.348612 252.25 1 0 0 late_majority ... 1.0 1.0 1 1 11 days 11:04:18 -6 days 1 219.7266 -6.0 1531

5 rows × 24 columns

In [ ]:
## IQR
print("Before", len(df_cx))
Before 95922
In [ ]:
target_iqr = [
    "num_orders", "rating_avg", "distance_cx_seller",
    "delta_days", "delta_creation"
    ]
In [ ]:
for column in df_cx.columns:
    if column in target_iqr:
        remove_outliers(column_eval=column, df=df_cx)
In [ ]:
print("After", len(df_cx))
After 85566
In [ ]:
dropcols = [
    "customer_uid", "cluster_kmeans_4", "cluster_DBSCAN",
    "k_cluster_name", "delta_delivery", "expected_reality",
    "order_id_list", "most_recent_order_dt", "lat", "lon",
    "most_ancient_order_dt",
    ]

df_model = df_cx.drop(columns=(dropcols), errors="ignore")

df_model.head()
Out[ ]:
recency frequency monetary num_orders rating_avg rating_ratio comment_ratio has_rated has_commented early_on_time distance_cx_seller delta_days delta_creation
0 44850283.0 0.053939 146.87 1 4.0 1.0 0.0 1 0 1 346.9776 -11.0 1969
1 24007314.0 0.100769 335.48 1 5.0 1.0 0.0 1 0 1 413.9531 -8.0 1728
2 13051353.0 0.185360 157.73 1 5.0 1.0 0.0 1 0 0 29.5741 1.0 1601
3 18840220.0 0.128406 173.30 1 5.0 1.0 0.0 1 0 1 19.3538 -13.0 1668
4 6939528.0 0.348612 252.25 1 5.0 1.0 1.0 1 1 1 219.7266 -6.0 1531

4.2 : PCA and interpretations : ¶

For more info about the class Easy_pca, documentation is provided in the model as docstring

  • First we want to get a scree plot to know about the principal components we need.
  • Then we'll want to take a look at the correlation circles to have a better grasp on what individual selected PC is.
  • For ease of use, we can also display a table with the coefficients of each variable
In [ ]:
epca_model = Easy_pca(dataset=df_model)
In [ ]:
fig = epca_model.get_scree_plot(show=False)

fig.set_dpi(pc_dpi)
fig.set_figheight(5)
fig.set_figwidth(8)

fig.tight_layout()
fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/124908471.py:8: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

Looks like we get around 90% cumulative inertia with PC1 through PC5. Let's take a closer look.

In [ ]:
fig = epca_model.display_circles(couple_pc=(0, 1), show=False)

fig.set_dpi(pc_dpi)
fig.set_figheight(5)
fig.set_figwidth(5)

fig.tight_layout()
fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/2243853122.py:8: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

In [ ]:
fig = epca_model.display_circles(couple_pc=(2, 3), show=False)

fig.set_dpi(pc_dpi)
fig.set_figheight(5)
fig.set_figwidth(5)

fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/1128338058.py:7: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

In [ ]:
fig = epca_model.display_circles(couple_pc=(3, 4), show=False)

fig.set_dpi(pc_dpi)
fig.set_figheight(5)
fig.set_figwidth(5)

fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/102305747.py:7: UserWarning:

Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.

In [ ]:
epca_model.biplot((0, 1))
In [ ]:
epca_model.biplot((2, 3))
In [ ]:
epca_model.biplot((3, 4))
In [ ]:
df_contribution = epca_model.show_contribution(lim_pc=5)

df_contribution
Out[ ]:
PC1 PC2 PC3 PC4 PC5
recency -5.839749e-01 0.061885 -1.397124e-02 -4.374793e-02 -3.328919e-02
frequency 5.407418e-01 -0.044440 1.523987e-03 6.366289e-02 1.824991e-02
monetary -6.013037e-03 -0.040135 1.608210e-02 6.883383e-02 6.452213e-01
num_orders 1.387779e-17 -0.000000 2.775558e-17 1.110223e-16 5.551115e-17
rating_avg 5.341263e-02 0.377926 -1.860646e-01 2.836695e-01 -1.405585e-01
rating_ratio 1.334033e-02 -0.173454 -6.434531e-01 -2.335224e-01 2.432052e-02
comment_ratio -7.940703e-02 -0.566676 9.576399e-03 3.862057e-01 -9.372702e-02
has_rated 1.334033e-02 -0.173454 -6.434531e-01 -2.335224e-01 2.432052e-02
has_commented -7.940703e-02 -0.566676 9.576399e-03 3.862057e-01 -9.372702e-02
early_on_time 2.186455e-02 0.304277 -2.883941e-01 4.878433e-01 -9.015474e-02
distance_cx_seller -5.782445e-02 -0.018728 -3.576837e-03 8.081852e-02 7.211011e-01
delta_days 7.658825e-02 -0.224894 2.308622e-01 -5.049915e-01 -1.204827e-01
delta_creation -5.839762e-01 0.061881 -1.396981e-02 -4.373149e-02 -3.329034e-02
Inertia 2.320000e+01 19.100000 1.700000e+01 1.270000e+01 9.100000e+00

We have there the coefficients of each variable, to get a better grasp on each one and to see what subset of variables contribute the most, we will use abs values for each PC, and rename it as we go.

In [ ]:
abs(df_contribution["PC1"]).sort_values(ascending=False)
Out[ ]:
Inertia               2.320000e+01
delta_creation        5.839762e-01
recency               5.839749e-01
frequency             5.407418e-01
comment_ratio         7.940703e-02
has_commented         7.940703e-02
delta_days            7.658825e-02
distance_cx_seller    5.782445e-02
rating_avg            5.341263e-02
early_on_time         2.186455e-02
has_rated             1.334033e-02
rating_ratio          1.334033e-02
monetary              6.013037e-03
num_orders            1.387779e-17
Name: PC1, dtype: float64

As we can see, PC1 represents Time. The frequency is positively correlated with PC1 while delta_creation and the recency are negatively correlated. It is basically the time since last order.

  • Proposed name : time_impact
In [ ]:
abs(df_contribution["PC2"]).sort_values(ascending=False)
Out[ ]:
Inertia               19.100000
comment_ratio          0.566676
has_commented          0.566676
rating_avg             0.377926
early_on_time          0.304277
delta_days             0.224894
has_rated              0.173454
rating_ratio           0.173454
recency                0.061885
delta_creation         0.061881
frequency              0.044440
monetary               0.040135
distance_cx_seller     0.018728
num_orders             0.000000
Name: PC2, dtype: float64

While there is more of a mix in PC2, we can clearly see that it represents the customer's involvement with the order : the variables on ratings and comments are both very important in this PC. While we see delta_days scoring quite high, we already established it was highly correlated with the ratings and user's involvement.

  • Proposed name : general_involvement
In [ ]:
abs(df_contribution["PC3"]).sort_values(ascending=False)
Out[ ]:
Inertia               1.700000e+01
has_rated             6.434531e-01
rating_ratio          6.434531e-01
early_on_time         2.883941e-01
delta_days            2.308622e-01
rating_avg            1.860646e-01
monetary              1.608210e-02
recency               1.397124e-02
delta_creation        1.396981e-02
comment_ratio         9.576399e-03
has_commented         9.576399e-03
distance_cx_seller    3.576837e-03
frequency             1.523987e-03
num_orders            2.775558e-17
Name: PC3, dtype: float64

While PC2 and PC3 might seem similar, PC3 seems entirely focused on ratings and seems to ignore comments. early_on_time and delta_days score also high, likely for the same reason as PC2. Taking the real values (not abs), we can see that rating avg. and comment ratio are anti-correlated to the variable "delta_days", it is interpretable as general dissatisfaction : when the value of delta days climbs, the ratings fall, as we have established in 2.1

  • Proposed name : rating_delay / dissatisfaction
In [ ]:
abs(df_contribution["PC4"]).sort_values(ascending=False)
Out[ ]:
Inertia               1.270000e+01
delta_days            5.049915e-01
early_on_time         4.878433e-01
comment_ratio         3.862057e-01
has_commented         3.862057e-01
rating_avg            2.836695e-01
has_rated             2.335224e-01
rating_ratio          2.335224e-01
distance_cx_seller    8.081852e-02
monetary              6.883383e-02
frequency             6.366289e-02
recency               4.374793e-02
delta_creation        4.373149e-02
num_orders            1.110223e-16
Name: PC4, dtype: float64

The absolute values of PC3 and PC4 are very similar, for a very good reason : while PC3 represents discontentment, PC4 represents satisfaction : delta days is highly negatively correlated with PC4, while rating average and early/on time are highly positively correlated with PC4. If delta days is low, and the order is on time or early, cx is happy.

  • Proposed name : overall_satisfaction
In [ ]:
abs(df_contribution["PC5"]).sort_values(ascending=False)
Out[ ]:
Inertia               9.100000e+00
distance_cx_seller    7.211011e-01
monetary              6.452213e-01
rating_avg            1.405585e-01
delta_days            1.204827e-01
comment_ratio         9.372702e-02
has_commented         9.372702e-02
early_on_time         9.015474e-02
delta_creation        3.329034e-02
recency               3.328919e-02
has_rated             2.432052e-02
rating_ratio          2.432052e-02
frequency             1.824991e-02
num_orders            5.551115e-17
Name: PC5, dtype: float64

This PC is highly correlated with 3 variables : monetary, num_orders and distance_cx_seller (negatively) : we established in 3.1 that order amounts decreased with the distance so it makes sense. We can think of PC5 as a variable that describes the volume of transaction and the overall sum of money people spend on Olist.

  • Proposed_name : value_and_volume
In [ ]:
pc_keep = ["PC1", "PC2", "PC3", "PC4", "PC5"]

df_model_pca = epca_model.pcframe[pc_keep]

df_model_pca.head()
Out[ ]:
PC1 PC2 PC3 PC4 PC5
0 -2.057101 1.183986 -0.225623 -0.804880 -0.239802
1 0.071706 1.138780 -0.229457 -0.521352 0.420751
2 1.541139 -0.235138 1.024167 -2.749404 -0.598572
3 0.694076 1.254976 -0.355856 -0.325240 -0.736289
4 2.579234 -1.459571 -0.103386 1.132912 -0.487445
In [ ]:
rename_dict = dict.fromkeys(pc_keep)

rename_dict["PC1"] = "time_impact"
rename_dict["PC2"] = "general_involvment"
rename_dict["PC3"] = "overall_discontentment"
rename_dict["PC4"] = "overall_satisfaction"
rename_dict["PC5"] = "value_and_volume"

df_model_pca = df_model_pca.rename(columns=rename_dict)
In [ ]:
df_model_pca.head()
Out[ ]:
time_impact general_involvment overall_discontentment overall_satisfaction value_and_volume
0 -2.057101 1.183986 -0.225623 -0.804880 -0.239802
1 0.071706 1.138780 -0.229457 -0.521352 0.420751
2 1.541139 -0.235138 1.024167 -2.749404 -0.598572
3 0.694076 1.254976 -0.355856 -0.325240 -0.736289
4 2.579234 -1.459571 -0.103386 1.132912 -0.487445

Moving forward we will use this dataset (same index) for clustering.

5 : Modelizations of adjusted parameters and exports :¶

  • From what the exploratory analysis revealed, customer satisfaction is largely based on punctuality from Olist. The distance also plays an important role. So, before the modelization. The early conclusion is that improvements must be done on those 2 factors :
    • The first by improving the reliability of shipping
    • The second by connecting more local sellers or opening warehouses in areas that are too remote for most sellers, as we saw that distance played a big role on the amount of orders and the repeat of orders (PCA)
  • We will, as we did with the RFM variables, try to cluster the customers based on the data we gathered post-PCA.
    • Will be used : K-means and DBSCAN, we will not use Agglomerative clustering for the same reason we did not use it for the RFM approach, it is too resource hungry on large datasets
  • We will choose the best clustering algorithm with the best hyperparameter(s) and take a broader look the clusters themselves, using the real values we had before applying a PCA.
  • We will export the final dataset and move on to another document solely focused on maintenance of the chosen algorithm.

    Why ? : This notebook is quite heavy a resource consuming. We will take the maintenance as a separate document to avoid :

  • 1 : Overusing the kernel which is already pretty full and will execute slowly on different setups (native jupyter, jupter-lab, vscode etc.)

  • 2 : A logic split between the interpretation, which can be changed if new data appears and disturbs the clusters, and the maintenance, which should be executed independently once the right algorithm and hyperparameter(s) have been chosen.

5.1 : K-Means optimization :¶

In [ ]:
df_model_pca.head()
Out[ ]:
time_impact general_involvment overall_discontentment overall_satisfaction value_and_volume
0 -2.057101 1.183986 -0.225623 -0.804880 -0.239802
1 0.071706 1.138780 -0.229457 -0.521352 0.420751
2 1.541139 -0.235138 1.024167 -2.749404 -0.598572
3 0.694076 1.254976 -0.355856 -0.325240 -0.736289
4 2.579234 -1.459571 -0.103386 1.132912 -0.487445
In [ ]:
k_range = range(2, 10)

k_means_optimizer(data=df_model_pca, k_range=k_range)

With this approach, 5 | 6 Clusters seems to be the way to go, it maximizes the Silhouette Score while having an acceptable SSE Trying 6 first as it has the best Sil Score, otherwise, 5 "might" be a bit more explainable, we'll see in the groups population, we will iterate 5-6 if there's a cluster with less than 1000 customers or so.

-- > 5 is more global, 6 does not seem to cluster correctly

In [ ]:
km = KMeans(n_clusters=5)
y_predicted = km.fit_predict(df_model_pca)
In [ ]:
df_model_pca["cluster_km"] = y_predicted
df_cx["cluster_km"] = y_predicted  # Index was kept between the dimsensionnal reductions.

5.2 : DBSCAN Clustering :¶

 We will also use DBSCAN with the same idea as RFM #4.2, with min points as dimension + 1 (4)  Epsilon can be determined using k neighbors. Using a graph to represent the avg distance between a point and its k-neighbors (here 4 : dimension + 1). Zooming in and using the elbow method help us to focus on the best potential epsilon.

In [ ]:
neighbors_matrix = df_model_pca.drop(columns=["cluster_km"]).to_numpy()
nneighbors = NearestNeighbors(n_neighbors=4, n_jobs=-1)  # dataset dim + 1

nneighbors.fit(X=neighbors_matrix)

distances, potential_eps = nneighbors.kneighbors(neighbors_matrix)

distances = np.sort(distances, axis=0)
distances_plot = distances[:,1]
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(16, 8),
    dpi=pc_dpi,
)

ax1.plot(distances_plot)

###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
fig.suptitle("Points sorted by distance - Neighbors = 6")
#
###

fig.tight_layout()
plt.show()
In [ ]:
# Lets innovate a bit and use plotly to zoom in

fig = px.line(distances_plot)
fig.show()

Around eps = 1.1 | 1.2 seems to be the tipping point

In [ ]:
dbs = DBSCAN(eps=1.15, min_samples=6, n_jobs=-1)

y_predict_dbs = dbs.fit_predict(df_model_pca.drop(columns=["cluster_k4", "cluster_DBSCAN"], errors="ignore"))
In [ ]:
df_model_pca["cluster_DBSCAN"] = y_predict_dbs
In [ ]:
df_model_pca["cluster_DBSCAN"].unique()
Out[ ]:
array([ 0,  1,  2,  3,  4,  5, -1,  7,  6])
Likely :¶

Theres a lot more clusters than Kmeans.
Since the dataset seems linearly separable, DBSCAN doesn't do a great job at clustering.

5.3 : Observations of clustering both with PCA applied and using UMAP :¶

  • Why not UMAP earlier ?

 While it is very good at dimensional reduction, it makes the axes hard to interpret. It helps visualizing but the more robust explanation will be through radars of the average and median of each group.

  • How ?

 We will use both radar charts to plot the average and/or median client/cluster and UMAP to reduce the dimensions from 6 to 3, making it possible to visualize.

In [ ]:
df_model_pca.head()
Out[ ]:
time_impact general_involvment overall_discontentment overall_satisfaction value_and_volume cluster_km cluster_DBSCAN
0 -2.057101 1.183986 -0.225623 -0.804880 -0.239802 4 0
1 0.071706 1.138780 -0.229457 -0.521352 0.420751 4 0
2 1.541139 -0.235138 1.024167 -2.749404 -0.598572 3 0
3 0.694076 1.254976 -0.355856 -0.325240 -0.736289 0 1
4 2.579234 -1.459571 -0.103386 1.132912 -0.487445 0 2
In [ ]:
df_model_pca_no_dbs = df_model_pca.drop(columns=["cluster_DBSCAN"], errors="ignore")
In [ ]:
k_zero = Cx_groups_post_pca(dataframe=df_model_pca_no_dbs, k_group=0, cluster_col="cluster_km")
k_one = Cx_groups_post_pca(dataframe=df_model_pca_no_dbs, k_group=1, cluster_col="cluster_km")
k_two = Cx_groups_post_pca(dataframe=df_model_pca_no_dbs, k_group=2, cluster_col="cluster_km")
k_three = Cx_groups_post_pca(dataframe=df_model_pca_no_dbs, k_group=3, cluster_col="cluster_km")
k_four = Cx_groups_post_pca(dataframe=df_model_pca_no_dbs, k_group=4, cluster_col="cluster_km")
In [ ]:
radar_zero = k_zero.store_radar()
radar_one = k_one.store_radar()
radar_two = k_two.store_radar()
radar_three = k_three.store_radar()
radar_four = k_four.store_radar()

# Let's superpose the radars : (Class Cx Group saves the trace)
In [ ]:
fig = go.Figure()

fig.add_trace(go.Scatterpolar(k_zero.trace))
fig.add_trace(go.Scatterpolar(k_one.trace))
fig.add_trace(go.Scatterpolar(k_two.trace))
fig.add_trace(go.Scatterpolar(k_three.trace))
fig.add_trace(go.Scatterpolar(k_four.trace))

title = "Stats of PCA vars (min maxed), clusters determined by k-means w/ k=4 on pca dataset"

fig.update_layout(
  title=title,
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, 10]
    )),
  showlegend=True
)

fig.show()
In [ ]:
df_model_pca["cluster_km"].value_counts()
Out[ ]:
4    28430
1    26172
0    23287
3     7046
2      631
Name: cluster_km, dtype: int64

 There's a cluster with around 650 clients, we can safely classify them in the atypical data category. Likely the result of an input error or an atypical behavior. Those are clients that we cannot interpret.

 We can see at first glance than a cluster containing around 7000 clients displays an average discontent higher than the other clusters. It can be seen as clients who are not satisfied with Olist/had a bad experience on the site.

 A group that is less distinguishable with those variables but still very important is the cluster that shares a lot of common stats with the two other majorities, except time_impact which is way higher in avg. than the other clusters. It points to very recent clients as time_impact (PC1) is negatively correlated with the variables that are used to determine the age of the account (namely the recency, and delta_creation, which is the agreed upon data pointing towards the first order of the account) higher values in these variables indicate that the account is recent and/or used recently.This is a group we must fidelize.

 We also have two majorities sharing the same stats. in avg., maj1 is remarkable as it has less involvement and less satisfaction than maj2.

Note : Cluster names cannot be given here as the re exec of KMeans will not always result the same cluster_number, so rerunning the script will give different numbers to clusters.

We can use UMAP to try and visualize them.

In [ ]:
reducer = UMAP(n_components=3, n_jobs=-1)

df_umap = df_model_pca[[
    "time_impact", "general_involvment", "overall_discontentment",
    "overall_satisfaction", "value_and_volume", "cluster_km"
    ]].copy()
In [ ]:
embedding = reducer.fit_transform(df_umap.drop(columns=["cluster_km"]))
In [ ]:
df_reduced = pd.DataFrame(embedding)

df_reduced["cluster_km"] = df_umap["cluster_km"].astype(str)  # Index kept, convert to str for plotly
In [ ]:
marker_style = {
    "size": 5,
    }

fig = px.scatter_3d(
    data_frame=df_reduced, x=0,
    y=1, z=2, color="cluster_km",
    width=4 * pc_dpi, height=3 * pc_dpi,
    )


fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="Visualisation of dataset using UMAP reducer",
)

fig.update_traces(marker=marker_style)

fig.show()

 Even though UMAP abstracted the variables we can see that indeed, the clusters we associated to unsatisfied customers is clearly visible. As well the the atypicals. The two majorities are well defined and we can see that the group we associated with the new customers is a sort of an extension of the two large majorities.

 We will study in details those groups before studying the maintenance suggestions.


We can go back to the original dataset and remove the clusters defined in RFM.

In [ ]:
df_cx.drop(columns=["cluster_kmeans_4", "cluster_DBSCAN", "k_cluster_name"], inplace=True)
In [ ]:
df_cx.columns
Out[ ]:
Index(['customer_uid', 'most_ancient_order_dt', 'most_recent_order_dt',
       'recency', 'frequency', 'monetary', 'num_orders', 'lat', 'lon',
       'order_id_list', 'rating_avg', 'rating_ratio', 'comment_ratio',
       'has_rated', 'has_commented', 'delta_delivery', 'expected_reality',
       'early_on_time', 'distance_cx_seller', 'delta_days', 'delta_creation',
       'cluster_km'],
      dtype='object')
In [ ]:
df_cx.head()
Out[ ]:
customer_uid most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders lat lon order_id_list ... comment_ratio has_rated has_commented delta_delivery expected_reality early_on_time distance_cx_seller delta_days delta_creation cluster_km
0 1 2017-05-16 15:05:35 2017-05-16 15:05:35 44850283.0 0.053939 146.87 1 -20.509897 -47.397866 [88493] ... 0.0 1 0 8 days 19:30:00 -11 days 1 346.9776 -11.0 1969 4
1 2 2018-01-12 20:48:24 2018-01-12 20:48:24 24007314.0 0.100769 335.48 1 -23.726853 -46.545746 [90419] ... 0.0 1 0 16 days 15:52:55 -8 days 1 413.9531 -8.0 1728 4
2 3 2018-05-19 16:07:45 2018-05-19 16:07:45 13051353.0 0.185360 157.73 1 -23.527788 -46.660310 [22558] ... 0.0 1 0 26 days 01:51:06 1 days 0 29.5741 1.0 1601 3
3 4 2018-03-13 16:06:38 2018-03-13 16:06:38 18840220.0 0.128406 173.30 1 -23.496930 -46.185352 [32181] ... 0.0 1 0 14 days 23:57:47 -13 days 1 19.3538 -13.0 1668 0
4 5 2018-07-29 09:51:30 2018-07-29 09:51:30 6939528.0 0.348612 252.25 1 -22.987222 -47.151073 [69903] ... 1.0 1 1 11 days 11:04:18 -6 days 1 219.7266 -6.0 1531 0

5 rows × 22 columns

In [ ]:
df_cx.columns
df_model_pca_no_dbs["most_ancient_order_dt"] = df_cx["most_ancient_order_dt"]  # Indexes are kept
df_model_pca_no_dbs.drop(columns=["cluster_km"]).to_pickle(path="../pickles/dataset_pca_maintenance.pkl")  # Export for maintenance

To name the clusters we can go with

  • Atypical (atypc_cx)
  • Unsatisfied (unsat_cx)
  • Majority 1 (maj_1)
  • Majority 2 (maj_2)
  • New customers (new_cx)

--

We cannot predict which x number will be y group name so we will use the input method and use the radars to reference the correct groups

In [ ]:
## Important : Maj 1 is the less satisfied majority, it will be important later !

atypc_cx = int(input("Atypical : "))
new_cx = int(input("New customers : "))
maj1 = int(input("Maj1 : "))
maj2 = int(input("Maj2 : "))
unsat_cx = int(input("Unsatisfied : "))
In [ ]:
rename_clusters = {
    atypc_cx: "atypical", new_cx: "new_client",
    maj1: "majority_1", maj2: "majority_2",
    unsat_cx: "unsatisfied"
    }

5.4 : A broader look at the clusters using the actual values instead of PCA/UMAP :¶

  • This will give us a hint of what might be a course of action for certain clusters, highlighting variables that have been blurred during PCA/UMAP
  • We will have a better idea of who's who if we do not use abstract values (standardized and scaled)

Foreword : We will drop the cluster containing atypical/outliers customers, a cluster size of less than 1% of the dataset will either be difficult to interpret or too time and resource consuming and not cost effective at a scale that small.

In [ ]:
def name_clusters(row, replacement_dict):
    key = int(row["cluster_km"])
    return replacement_dict[key]
In [ ]:
df_cx["kmeans_name"] = df_cx.apply(func=name_clusters, args=([rename_clusters]), axis=1)
In [ ]:
# Pre drop

len(df_cx)
Out[ ]:
85566
In [ ]:
# Removing atypical group

df_cx = df_cx[df_cx["kmeans_name"] != "atypical"]
In [ ]:
# Post drop
len(df_cx)
Out[ ]:
84935

5.4.1 : Can we blend the two majorities ? If not, what are their differences ?¶

 Focusing on the big picture first, we have two majorities which seem to share a lot, but UMAP showed that several variables were significatively different. On the radar chart we can hypothesize that, because it is the involvement component that mainly differs them, we need to look at comments and ratings ratios. The ratings/satisfaction seem, although one cluster shows a minor decrease in satisfaction, the most important differences.

 As we saw in early visualizations, both ratios are really close to 1 or 0 (most customers only ordered once). A plot will not easily capture the difference, we might as well use the average per cluster

In [ ]:
## Involvment :

maj1_avg_comment_ratio = np.average(df_cx[df_cx["kmeans_name"] == "majority_1"]["comment_ratio"].values)
maj2_avg_comment_ratio = np.average(df_cx[df_cx["kmeans_name"] == "majority_2"]["comment_ratio"].values)

print(f"maj1 average comment ratio = {maj1_avg_comment_ratio}, maj2 average comment ratio = {maj2_avg_comment_ratio}")
maj1 average comment ratio = 1.0, maj2 average comment ratio = 0.0
In [ ]:
maj1_avg_rating_ratio = np.average(df_cx[df_cx["kmeans_name"] == "majority_1"]["rating_ratio"].values)
maj2_avg_rating_ratio = np.average(df_cx[df_cx["kmeans_name"] == "majority_2"]["rating_ratio"].values)

print(f"maj1 average rating ratio = {maj1_avg_rating_ratio}, maj2 average rating ratio = {maj2_avg_rating_ratio}")
maj1 average rating ratio = 1.0, maj2 average rating ratio = 1.0

Observations :¶

 While the average rating ratio is as expected, it seems the major factor separating the two clusters is the fact that majority 2 has not left a comment. It explains why there was a drop in the PC summarizing the general involvement.
 Let's take a look at the ratings, maybe there are more differences. If the ratings are significantly different, we will use two boxplots to get a better overall view.

In [ ]:
maj1_avg_ratings = np.average(df_cx[df_cx["kmeans_name"] == "majority_1"]["rating_avg"].values)
maj2_avg_ratings = np.average(df_cx[df_cx["kmeans_name"] == "majority_2"]["rating_avg"].values)

print(f"maj1 average rating ratio = {maj1_avg_ratings}, maj2 average rating ratio = {maj2_avg_ratings}")
maj1 average rating ratio = 3.9249579703499924, maj2 average rating ratio = 4.473619416109743

It's significant enough, let's take a broader look

In [ ]:
fig, ((ax1), (ax2)) = plt.subplots(
    ncols=1,
    nrows=2,
    figsize=(8, 5),
    dpi=pc_dpi,
)

ax1.boxplot(x=df_cx[df_cx["kmeans_name"] == "majority_1"]["rating_avg"].values, vert=False, showmeans=True, widths=0.6)
ax2.boxplot(x=df_cx[df_cx["kmeans_name"] == "majority_2"]["rating_avg"].values, vert=False, showmeans=True, widths=0.6)


###
# Titles/Lables
ax1.set(yticklabels=[])
ax1.set(title="Repartition of ratings, first majority")
ax2.set(yticklabels=[])
ax2.set(title="Repartition of ratings, second majority")
#
###

fig.tight_layout()
plt.show()

Observation :¶

 While both ratings are high, the first majority seems to be less enthusiast than the second majority. Not enough to be an issue as ratings are well above the 2.5 mark, but it would be important to keep an eye on majority 1 to avoid a downward trend. Applying the same measures as we will recommend for the overall dissatisfied customers, although not at the same intensity, might be a way to prevent clients from majority 1 to become unsatisfied.

 To conclude, we cannot merge the majorities, and we also need to keep an eye on majority 1, keeping them happy and making them happier as, by their population, they represent a sizable portion of the dataset. (around 25-30%) -- Renaming customers part of maj1 to can_shift. They are not unhappy yet, but could become so.

In [ ]:
df_cx["kmeans_name"].replace(to_replace={"majority_1": "can_shift"}, inplace=True)

5.4.2 : What causes the discontent of the unsatisfied group ?¶

  • This particular cluster shows sign of discontent : lack of involvement as well as a high value for the PC we associated with unsatisfaction (and, logically, lower value in the satisfaction PC).
  • It is necessary to have a preliminary diagnostic to :

    • Possibly shift the customers towards a better overall satisfaction
    • Preventing customers that can_shift towards this group to do so.

  • We will take a look at average ratings, delays and distances to try and find a pattern if it exists.

In [ ]:
# Rating repartition : 

fig, ((ax1), (ax2)) = plt.subplots(
    ncols=1,
    nrows=2,
    figsize=(8, 5),
    dpi=pc_dpi,
)

ax1.boxplot(x=df_cx[df_cx["kmeans_name"] == "unsatisfied"]["rating_avg"].values, vert=False, showmeans=True, widths=0.6)
ax2.boxplot(x=df_cx["rating_avg"].values, vert=False, showmeans=True, widths=0.6)


###
# Titles/Lables
ax1.set(yticklabels=[])
ax1.set(title="Repartition of ratings, unsatisfied customers")
ax2.set(yticklabels=[])
ax2.set(title="Repartition of ratings, dataset wide")
#
###

fig.tight_layout()
plt.show()

Observation :¶

 No surprise here, it was expected from unsat. cluster to have ratings that are way lower in contrast to the rest of the dataset. It looks like the split we did (people that rated at most 2.5 in average, see #2).
 We expect higher delta days between expected and actual delivery. We will take a look at distance from seller as well.

In [ ]:
# Using the same code from #2 : 

amount_eot_all = len(df_cx[df_cx["early_on_time"] == True])
amount_eot_unsat = len(df_cx[(df_cx["early_on_time"] == True) & (df_cx["kmeans_name"] == "unsatisfied")])

percentage_all = (amount_eot_all / len(df_cx)) * 100
percentage_unsat = (amount_eot_unsat / len(df_cx[df_cx["kmeans_name"] == "unsatisfied"])) * 100

print(f"There is {percentage_all}% of who have been delivered on time on the whole dataset")
print(f"There is {percentage_unsat}% of who have been delivered on time or early in the unsat group")

print(f"""
In total, {amount_eot_unsat} orders have been delivered on time or early in the unsat group,\
 group size is {len(df_cx[df_cx["kmeans_name"] == "unsatisfied"])}
""")
There is 91.14852534290928% of who have been delivered on time on the whole dataset
There is 0.0709622480840193% of who have been delivered on time or early in the unsat group

In total, 5 orders have been delivered on time or early in the unsat group, group size is 7046

Observation :¶

 It is safe to say that the unsatisfied customers are the ones who have been impacted by delays the most. Recommendations for this group, considering the data we have, is to offer an incentive/a compensation for the delay issue, and to address it as soon as possible to avoid general insatisfaction.

Conclusion :¶

 With these groups identified, it is possible to give a reliable explanation of which client is in which group. Maintenance will be done on the PCA dataset exported earlier to avoid differences. We can export the dataset containing client important info as an output, both pickle and csv will be used to maximize compatibility.


Recommendations :

  • Target the dissatisfied group specifically with incentives/compensations and ensure that differences between estimated and actual delivery will be kept to a minimum, or to a negative (early delivery)
  • Majority 2 and New Customers do not necessitate any particular action, they are satisfied by their experience on Olist.
  • Majority 1's, renamed can_shift, delivery times must be kept low, actions recommended on the dissatisfied group are also recommended for this group, although not with the same intensity.
In [ ]:
df_cx.to_pickle(path="../pickles/final_clustering.pkl")
df_cx.to_csv(path_or_buf="../data/final/final_clustering.csv", index=False)